ANOVA to compare sample means

Using ANOVA to compare GI for different noodles

lruolin
08-16-2021

Data

The data used below was sourced from [Easy Food Science for Statistics]https://www.sciencedirect.com/book/9780128142622/easy-statistics-for-food-science-with-r

Objective

Load Packages

library(pacman)
p_load(tidyverse, janitor, datapasta, ggthemes, broom, gridExtra, ggstatsplot,
       GGally)

Load Data

data <- tibble::tribble(
                ~Noodles, ~GI_1, ~GI_2, ~GI_3,
          "conventional", 52.83, 53.02, 52.85,
   "ripe cavendish pulp",  48.8, 49.05, 48.65,
   "ripe cavendish peel", 50.66, 50.64, 50.95,
  "green cavendish pulp", 48.41, 47.87, 48.63,
  "green cavendish peel", 51.49, 51.41, 48.09,
       "ripe dream pulp", 47.99, 47.73, 49.36,
       "ripe dream peel",  49.6, 49.36, 49.09,
      "green dream pulp", 47.26, 47.88,  47.7,
      "green dream peel", 52.52, 52.55, 52.54
  ) %>% 
    mutate(noodles_abbrev =factor(Noodles,
                                     levels = c("conventional",
                                     "ripe cavendish pulp",
                                     "ripe cavendish peel",
                                     "green cavendish pulp",
                                     "green cavendish peel",
                                     "ripe dream pulp",
                                     "ripe dream peel",
                                     "green dream pulp",
                                     "green dream peel"),
         labels = c("control", "rcpulp", "rcpeel", "gcpulp", "gcpeel",
                     "rdpulp", "rdpeel", "gdpulp",
                    "gdpeel"))) 

# data without control
data_no_control <- data %>% 
  filter(Noodles != "conventional") %>% 
    # create additional columns in case I want to look at effect of species/ripeness/part later
  mutate(noodles2 = Noodles) %>% 
  separate(noodles2, c("ripe_unripe", "species", "part"))

glimpse(data_no_control)
Rows: 8
Columns: 8
$ Noodles        <chr> "ripe cavendish pulp", "ripe cavendish peel",…
$ GI_1           <dbl> 48.80, 50.66, 48.41, 51.49, 47.99, 49.60, 47.…
$ GI_2           <dbl> 49.05, 50.64, 47.87, 51.41, 47.73, 49.36, 47.…
$ GI_3           <dbl> 48.65, 50.95, 48.63, 48.09, 49.36, 49.09, 47.…
$ noodles_abbrev <fct> rcpulp, rcpeel, gcpulp, gcpeel, rdpulp, rdpee…
$ ripe_unripe    <chr> "ripe", "ripe", "green", "green", "ripe", "ri…
$ species        <chr> "cavendish", "cavendish", "cavendish", "caven…
$ part           <chr> "pulp", "peel", "pulp", "peel", "pulp", "peel…

Visualization

data_long <- data %>% 
  pivot_longer(cols = starts_with("GI"),
               names_to = "reading",
               values_to = "gi")


glimpse(data_long)
Rows: 27
Columns: 4
$ Noodles        <chr> "conventional", "conventional", "conventional…
$ noodles_abbrev <fct> control, control, control, rcpulp, rcpulp, rc…
$ reading        <chr> "GI_1", "GI_2", "GI_3", "GI_1", "GI_2", "GI_3…
$ gi             <dbl> 52.83, 53.02, 52.85, 48.80, 49.05, 48.65, 50.…
summary(data_long)
   Noodles          noodles_abbrev   reading                gi       
 Length:27          control:3      Length:27          Min.   :47.26  
 Class :character   rcpulp :3      Class :character   1st Qu.:48.25  
 Mode  :character   rcpeel :3      Mode  :character   Median :49.36  
                    gcpulp :3                         Mean   :49.89  
                    gcpeel :3                         3rd Qu.:51.45  
                    rdpulp :3                         Max.   :53.02  
                    (Other):9                                        
fig_anova <- data_long%>% 
  group_by(Noodles) %>% 
  summarise(mean_gi = mean(gi)) %>% 
  ggplot(aes(x = fct_reorder(Noodles, mean_gi), y = mean_gi)) +
  geom_boxplot() +
  geom_text(aes(label = round(mean_gi, 2)), hjust = -0.25) +
  scale_y_continuous(limits = c(47, 55)) +
  labs(title = "Comparing GI of different noodles",
       subtitle = "Conventional noodles had the highest GI, and flours made with peel have higher GI than that from pulp.",
       x = "",
       y = "Mean GI (Average of Triplicates)",
       caption = "Source: Easy Statistics for Food Science with R") +
  coord_flip() +
  theme_clean() 
# 1 way anova:

m1_aov <- aov(gi ~ noodles_abbrev, data = data_long)
summary(m1_aov)
               Df Sum Sq Mean Sq F value   Pr(>F)    
noodles_abbrev  8  85.34  10.668   19.46 2.26e-07 ***
Residuals      18   9.87   0.548                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post-hoc comparison

fig_post_hoc <- TukeyHSD(m1_aov, which = "noodles_abbrev", ordered = F) %>% 
  tidy() %>% 
  filter(adj.p.value<0.05) %>% 
  mutate(high_low = case_when(estimate >=0 ~ "higher",
                             TRUE ~ "lower")) %>% 
  ggplot(aes(x = fct_reorder(contrast, estimate), y = estimate)) +
  geom_point(size = 3, aes(col = high_low)) +
  geom_errorbar(aes(ymin = conf.low,
                    ymax = conf.high, col = high_low), width = 0.5) +
  coord_flip() +
  labs(title = "95% family-wise confidence level for groups with p<0.05",
       x = "",
       y = "Differences in mean levels of noodles") +
  theme_clean() +
  theme(legend.position = "none")

grid.arrange(fig_anova, fig_post_hoc)

The results below are slightly different due to different post-hoc analysis.

# another way, using ggstatsplot

ggbetweenstats(
  data = data_long,
  x = noodles_abbrev,
  y = gi,
  plot.type = "box",
  type = "p",
  package = "ggsci",
  palette = "nrc_npg",
  pairwise.display = "significant",
  title = "Comparing GI for different noodles"
)

Confounding?

Let’s look at the effect of ripeness, variety and part on GI.

glimpse(data_no_control)
Rows: 8
Columns: 8
$ Noodles        <chr> "ripe cavendish pulp", "ripe cavendish peel",…
$ GI_1           <dbl> 48.80, 50.66, 48.41, 51.49, 47.99, 49.60, 47.…
$ GI_2           <dbl> 49.05, 50.64, 47.87, 51.41, 47.73, 49.36, 47.…
$ GI_3           <dbl> 48.65, 50.95, 48.63, 48.09, 49.36, 49.09, 47.…
$ noodles_abbrev <fct> rcpulp, rcpeel, gcpulp, gcpeel, rdpulp, rdpee…
$ ripe_unripe    <chr> "ripe", "ripe", "green", "green", "ripe", "ri…
$ species        <chr> "cavendish", "cavendish", "cavendish", "caven…
$ part           <chr> "pulp", "peel", "pulp", "peel", "pulp", "peel…
data_long_no_control <- data_no_control %>% 
  pivot_longer(cols = starts_with("GI"),
               names_to = "replicates",
               values_to = "gi")

# main effect
m2_aov <- aov(gi ~ ripe_unripe + species + part, data = data_long_no_control)
summary(m2_aov) # residuals reduced compared to m2_aov
            Df Sum Sq Mean Sq F value   Pr(>F)    
ripe_unripe  1   0.83    0.83   0.612    0.443    
species      1   0.05    0.05   0.035    0.853    
part         1  36.43   36.43  26.767 4.62e-05 ***
Residuals   20  27.22    1.36                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# interaction

m3_aov <- aov(gi ~ ripe_unripe * species * part, data = data_long_no_control)
summary(m3_aov)
                         Df Sum Sq Mean Sq F value   Pr(>F)    
ripe_unripe               1   0.83    0.83   1.353  0.26181    
species                   1   0.05    0.05   0.078  0.78424    
part                      1  36.43   36.43  59.208 9.15e-07 ***
ripe_unripe:species       1   4.31    4.31   7.004  0.01760 *  
ripe_unripe:part          1   6.13    6.13   9.963  0.00611 ** 
species:part              1   1.46    1.46   2.365  0.14361    
ripe_unripe:species:part  1   5.48    5.48   8.909  0.00875 ** 
Residuals                16   9.85    0.62                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# testing the models

anova(m2_aov, m3_aov) # p -s significant so interaction term is not by chance. 
Analysis of Variance Table

Model 1: gi ~ ripe_unripe + species + part
Model 2: gi ~ ripe_unripe * species * part
  Res.Df     RSS Df Sum of Sq      F   Pr(>F)   
1     20 27.2226                                
2     16  9.8453  4    17.377 7.0602 0.001787 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Checking assumptions --

# to check for homogeneity of variance
library(car)
leveneTest(gi ~ ripe_unripe*species*part, data = data_long_no_control)  # p> 0.05, so equal variance can be assumed
Levene's Test for Homogeneity of Variance (center = median)
      Df F value Pr(>F)
group  7  0.7401 0.6424
      16               
# to check whether residuals are normally distributed
m3_aov$residuals %>% 
  as_tibble() %>% 
  ggplot(aes(value)) +
  geom_histogram() +
  theme_clean() # slightly right-skewed

From the ANOVA results, the part of banana used (pulp or peel) has an effect on GI.

There are also interactions between ripeness:species, ripeness: part.

Visualizing interaction

fig_a <- data_long_no_control %>% 
  ggplot(aes(x = ripe_unripe, y = gi)) +
  geom_boxplot(aes(fill = species)) +
  labs(title = "Ripeness:Species Interaction",
       x = "Green/Ripe",
       y = "GI noodles",
       fill = "Species") +
  theme_clean()

fig_b <- data_long_no_control %>% 
  ggplot(aes(x = ripe_unripe, y = gi)) +
  geom_boxplot(aes(fill = part)) +
  labs(title = "Ripeness:Part Interaction",
       x = "Green/Ripe",
       y = "GI",
       fill = "Part") +
  theme_clean()

fig_c <- data_long_no_control %>% 
  ggplot(aes(x = part, y = gi)) +
  geom_boxplot(aes(fill = species)) +
  labs(title = "Part:Species Interaction",
       x = "Part",
       y = "GI") +
  theme_clean()

grid.arrange(fig_a, fig_b, fig_c, ncol = 2)

Learning points

Citation

For attribution, please cite this work as

lruolin (2021, Aug. 16). pRactice corner: ANOVA to compare sample means. Retrieved from https://lruolin.github.io/myBlog/posts/20210815 ANOVA to compare means/

BibTeX citation

@misc{lruolin2021anova,
  author = {lruolin, },
  title = {pRactice corner: ANOVA to compare sample means},
  url = {https://lruolin.github.io/myBlog/posts/20210815 ANOVA to compare means/},
  year = {2021}
}